"""
    Analyse words with the highest variance
"""
import os
import pandas as pd
import numpy as np
import ast
import seaborn as sns

import matplotlib.pyplot as plt

def load_all_csv_one_csv(path):
    """
        Merge every csv file found directly inside a folder into one
        dataframe, rank the rows by the mean of their ``std_vec`` entries
        and write the ranking to ``./words_by_wordnetsense_var.csv``
        (relative to the current working directory).

        The csv files must provide at least the columns ``word``,
        ``wordnet_senses``, ``std_vec`` and ``mean_vec``. The two vector
        columns are parsed with ``ast.literal_eval``, so they have to hold
        Python literals (e.g. lists of numbers) stored as text.

    :param path: folder that contains the csv files; a trailing path
        separator is optional
    :raises ValueError: if the folder does not contain any ``.csv`` file
    :return: None, the ranked table is only printed and written to disk
    """
    out = []
    # Sorted so that the files are always concatenated in the same order
    for file_name in sorted(os.listdir(path)):
        # Look at the extension of the file name only: a substring test on
        # the full path would also accept e.g. "x.csv.bak" or every file
        # inside a folder whose own name contains ".csv".
        if not file_name.endswith(".csv"):
            continue
        # os.path.join works whether or not `path` ends with a separator
        file_path = os.path.join(path, file_name)
        print("Including file ", file_path)
        out.append(pd.read_csv(file_path))

    if not out:
        # pd.concat would raise a ValueError as well, just a less helpful one
        raise ValueError("No .csv files found in {!r}".format(path))

    df = pd.concat(out, axis=0)
    print(df.head())
    print(df.columns)

    # The vectors are stored as text in the csv files, parse them back
    df['std_vec'] = df['std_vec'].apply(ast.literal_eval)
    df['mean_vec'] = df['mean_vec'].apply(ast.literal_eval)

    # NOTE: despite the column name this is the *mean* over the entries of
    # std_vec, not a sum and not a variance.
    df['var_sum'] = [np.mean(x) for x in df['std_vec']]

    print(df.head())
    print(df.columns)

    # Rows with the largest average standard deviation come first
    df = df.sort_values('var_sum', ascending=False)

    print(df.head(100)[['word', 'wordnet_senses', 'var_sum']])
    df.to_csv("./words_by_wordnetsense_var.csv")

def check_correlation(df):
    """
        Show how ``var_sum`` relates to the leading integer of the
        ``wordnet_senses`` column, drawn as a seaborn joint plot with a
        regression fit.

        Each ``wordnet_senses`` entry has to be a string; the text in front
        of its first comma (without surrounding brackets / parentheses) is
        converted to an int and used as the x value.
        NOTE(review): presumably that integer is the number of WordNet
        senses of the word -- confirm against the code that writes the csv.

        The passed dataframe is not modified.

    :param df: dataframe with the columns ``wordnet_senses`` and ``var_sum``
    :raises ValueError: if a leading entry cannot be converted to an int
    :return: None, the figure is displayed with ``plt.show()``
    """
    # Strip the brackets explicitly instead of dropping the first character:
    # this also copes with single-element entries such as "[5]".
    leading_value = df['wordnet_senses'].apply(
        lambda raw: raw.split(",")[0].strip("[]() ")
    )
    # assign() returns a new frame, so the caller's column stays untouched
    # and the function can be called repeatedly on the same dataframe.
    plot_df = df.assign(wordnet_senses=leading_value.astype(int))
    print("df is: ")
    print(plot_df.head())

    # x / y have to be passed by keyword, recent seaborn versions no longer
    # accept them as positional arguments.
    sns.jointplot(
        x="wordnet_senses", y="var_sum", data=plot_df,
        kind="reg", truncate=False,
        scatter_kws={"s": 3},
    )
    plt.show()

if __name__ == "__main__":

    print("Starting to analyse which words have highest variance")

    # Step 1 (currently disabled): build the ranking csv from the folder
    # that holds the per-word mean / std vector csv files.
    vectors_dir = "./notebooks/2020_03_14 analyse words with most variance/_mean_std_vector_none_768_whitenFalse_norm/"
    # load_all_csv_one_csv(vectors_dir)

    # Step 2: read the previously saved ranking and plot var_sum against
    # the wordnet_senses column.
    ranked_words = pd.read_csv("./notebooks/2020_03_14 analyse words with most variance/words_by_wordnetsense_var.csv")
    check_correlation(ranked_words)
